/* d_fin_analysis_prep.do - ****************************************************

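	This file builds the final analysis samples. It merges the NSC data with
	the SIMS panel and the test data, constructs the analysis variables, and
	saves the resulting files. The NSC records are linked using primary name
	and DOB from the enrollment files.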

******************************************************************************/
clear
set more off

/************************ Data Merge ******************************************/
* Start from the college (NSC) file. -clear- is the documented option of -use-
* for discarding the data in memory; -replace- is not a documented option.
use "$stata_data_analysis/nsc/nsc_ready_fin.dta", clear

* Keep only rows whose gr_asgn is "K1". in_K1 stays in the data as a
* convenience flag for later subsetting.
gen in_K1= gr_asgn == "K1"
keep if in_K1 == 1

* Merge in the SIMS panel on studentno. Using-only rows are discarded;
* in_sims flags the master rows that found a SIMS record.
merge 1:1 studentno using "$stata_data_sims/SIMS_wide.dta", keep(master match)
gen in_sims = _merge == 3
drop _merge
replace nsc_prim_name_link = 0 if nsc_prim_name_link == .

* Merge in test data on a numeric version of sasid. Rows without a numeric
* sasid receive the placeholder -_n (distinct per row) so the key stays
* unique for the 1:1 merge; an empty string sasid is filled with the same value.
destring sasid, gen(sasid_no)
replace sasid_no = -_n if sasid_no == .
replace sasid = string(sasid_no) if sasid == ""
merge 1:1 sasid_no using "$stata_data/test_wide.dta", gen(test_merge) keep(match master)

/************************* Generate Secondary Instrument ************************/

* soff_p is the difference between soffany and pformany
generate soff_p = soffany - pformany

* Rescale random by a factor of one million
replace random = random / 1000000

/************** The Median Family Income of Assignment Block ******************/

* Quartile-bin indicators for med_inc_fam: inc1 is the top bin [p75, max],
* inc4 the bottom bin [min, p25). Bins are closed below and open above, except
* the top bin, which is closed at the maximum so that observations equal to
* the maximum are not left out of every bin. Observations with missing
* med_inc_fam get 0 in all four indicators.
su med_inc_fam, detail

gen inc1 = med_inc_fam <= `r(max)' &  med_inc_fam >= `r(p75)' & !missing(med_inc_fam)
gen inc2 = med_inc_fam < `r(p75)' &  med_inc_fam >= `r(p50)' & !missing(med_inc_fam)
gen inc3 = med_inc_fam < `r(p50)' &  med_inc_fam >= `r(p25)' & !missing(med_inc_fam)
gen inc4 = med_inc_fam < `r(p25)' &  med_inc_fam >= `r(min)' & !missing(med_inc_fam)

/*********************** Generate School Averages *****************************/

* Combined school-level measure per grade: the sum of the sch_std_e and
* sch_std_m values for that grade.
gen sch_mcas4 = sch_std_e4 + sch_std_m4
gen sch_mcas8 = sch_std_e8 + sch_std_m8
gen sch_mcas10 = sch_std_e10 + sch_std_m10

local qt_vars sch_total_susp4 sch_total_susp8 sch_total_susp10 ///
	sch_truancy4 sch_truancy8 sch_truancy10 sch_mcas4 ///
	sch_mcas8 sch_mcas10

* For each school-level measure build four quartile-bin indicators:
* _qt1 is the top bin [p75, max], _qt4 the bottom bin [min, p25).
* The top bin is closed at the maximum so that observations equal to the
* maximum belong to a bin. Missing values get 0 in all four indicators.
foreach var in `qt_vars' {

	su `var', detail

	gen `var'_qt1 = `var' <= `r(max)' &  `var' >= `r(p75)' & !missing(`var')
	gen `var'_qt2 = `var' < `r(p75)' &  `var' >= `r(p50)' & !missing(`var')
	gen `var'_qt3 = `var' < `r(p50)' &  `var' >= `r(p25)' & !missing(`var')
	gen `var'_qt4 = `var' < `r(p25)' &  `var' >= `r(min)' & !missing(`var')

}


/************************ Fix up SIMS data ************************************/


* Enrollment data: for indices 1-12 a missing value is recoded to zero,
* then the early indices are blanked out again for the first assignment years.
local zero_vars ontime_asgn enrolled
foreach stub of local zero_vars {
	forvalues k = 1/12 {
		summarize `stub'`k'
		replace `stub'`k' = 0 if `stub'`k' == .

		* Missing data sets: assignment years 1997-1998 for index 1,
		* assignment year 1997 for index 2
		if `k' == 1 {
			replace `stub'`k' = . if inrange(yr_asgn,1997,1998)
		}
		else if `k' == 2 {
			replace `stub'`k' = . if yr_asgn == 1997
		}
	}
}


local sim_vars dropout incarcerated anysusp truancy attend transferred num_absences ///
	repeat_grade_asgn days_incarc charter exam
foreach v of local sim_vars {

	* Wildcard list covering the one- and two-character suffixes of this stub
	local all_suffixes `v'? `v'??

	* Overall total (missing when every input is missing) and an "ever" flag
	egen num_`v' = rowtotal(`all_suffixes'), missing
	generate ever_`v' = num_`v' > 0 if !missing(num_`v')

	* Same quantities restricted to rows with a non-missing enstat9
	* (this total is built without the missing option)
	egen num_`v'_9 = rowtotal(`all_suffixes') if enstat9 != .
	generate ever_`v'_9 = num_`v' > 0 if !missing(num_`v') & enstat9 != .

	* High school (suffixes 9-12): total, total divided by 4, and "ever" flag
	egen hs_num_`v'_tot = rowtotal(`v'9 `v'10 `v'11 `v'12), missing
	generate hs_num_`v' = hs_num_`v'_tot / 4
	generate hs_ever_`v' = hs_num_`v' > 0 if !missing(hs_num_`v')

	* High school versions restricted to rows with a non-missing enstat9
	generate hs_num_`v'_9 = hs_num_`v' if enstat9 != .
	generate hs_num_`v'_tot_9 = hs_num_`v'_tot if enstat9 != .

	generate hs_ever_`v'_9 = hs_ever_`v' if enstat9 != .

	* Middle school (suffixes 5-8)
	egen mid_num_`v'_tot = rowtotal(`v'5 `v'6 `v'7 `v'8), missing
	generate mid_num_`v' = mid_num_`v'_tot / 4
	generate mid_ever_`v' = mid_num_`v' > 0 if !missing(mid_num_`v')

	* Elementary school (suffixes 1-4)
	egen elem_num_`v'_tot = rowtotal(`v'1 `v'2 `v'3 `v'4), missing
	generate elem_num_`v' = elem_num_`v'_tot / 4
	generate elem_ever_`v' = elem_num_`v' > 0 if !missing(elem_num_`v')

}

* Age on September 1 of the assignment year, using 365-day years
generate date_PreK_enr = mdy(9, 1, yr_asgn)
generate age_PreK_enr = (date_PreK_enr - dob) / 365

* ontime1, kept only for rows with a non-missing enstat9
generate ontime1_asgn9 = ontime1 if enstat9 != .
order *, sequential

* Row mean of the attshby* variables, overall and where enstat9 is non-missing
egen ever_attshby = rowmean(attshby*)
egen ever_attshby_9 = rowmean(attshby*) if enstat9 != .

* High school graduation variables
generate grad_hs = hsgrad11 == 1 | hsgrad12 == 1
generate grad_hs_ontime = grad_hs & ontime_asgn12

* Variants conditional on graduating and/or on a non-missing enstat9
generate grad_ontime_cond = ontime12 if grad_hs == 1
generate grad_hs_9 = grad_hs if enstat9 != .
generate grad_ontime_9 = grad_hs & ontime12 if enstat9 != .
generate grad_ontime_cond_9 = ontime12 if enstat9 != . & grad_hs == 1

* Behavioral index, high school version
	* Inputs: log(1 + x) of hs_num_num_absences and hs_num_anysusp, the
	* negation of grad_hs_ontime, and ever_incarcerated. The index is the
	* score that -predict- returns after the PCA.
	generate hs_lg_absence_fin = log(1 + hs_num_num_absences)
	generate hs_lg_susp = log(1 + hs_num_anysusp)
	generate not_grad_hs_ontime = !grad_hs_ontime
	pca hs_lg_absence_fin not_grad_hs_ontime hs_lg_susp ever_incarcerated
	predict behav_index_tot, score

	* Standardize with the mean and sd of the rows that have in_K1 set,
	* soffany equal to 0 and pformany not equal to 0 or 1; then flip the sign
	summarize behav_index_tot if in_K1 & soffany == 0 & !inlist(pformany,0,1)
	local control_mean = r(mean)
	local control_sd = r(sd)
	replace behav_index_tot = (behav_index_tot - `control_mean') / `control_sd'
	replace behav_index_tot = -behav_index_tot

* Behavioral index, middle school version
	* Inputs: log(1 + x) of mid_num_num_absences and mid_num_anysusp, the
	* negation of ontime_asgn8, and mid_ever_incarcerated. The index is the
	* score that -predict- returns after the PCA.
	generate mid_lg_absence_fin = log(1 + mid_num_num_absences)
	generate mid_lg_susp = log(1 + mid_num_anysusp)
	generate grad_ms_ontime = ontime_asgn8
	generate not_grad_ms_ontime = !grad_ms_ontime
	pca mid_lg_absence_fin not_grad_ms_ontime mid_lg_susp mid_ever_incarcerated
	predict ms_behav_index_tot, score

	* Standardize with the mean and sd of the rows that have in_K1 set,
	* soffany equal to 0 and pformany not equal to 0 or 1; then flip the sign
	summarize ms_behav_index_tot if in_K1 & soffany == 0 & !inlist(pformany,0,1)
	local control_mean = r(mean)
	local control_sd = r(sd)
	replace ms_behav_index_tot = (ms_behav_index_tot - `control_mean') / `control_sd'
	replace ms_behav_index_tot = -ms_behav_index_tot

*Generate the version of behav_index that is conditional on a non-missing enstat9
* Every input is the _9 variant, i.e. missing whenever enstat9 is missing.
gen hs_lg_absence_fin_9 = log(hs_num_num_absences_9 + 1)
gen hs_lg_susp_9 = log(hs_num_anysusp_9 + 1)
* In Stata !(.) evaluates to 0, so guard explicitly to keep missing as missing
gen not_grad_hs_ontime_9 = !grad_ontime_9 if !missing(grad_ontime_9)
pca hs_lg_absence_fin_9 not_grad_hs_ontime_9 hs_lg_susp_9 ever_incarcerated_9
predict behav_index_tot_9, score

*Normalize the index with the mean and sd of the rows that have in_K1 set,
*soffany equal to 0 and pformany not equal to 0 or 1; then flip the sign
su behav_index_tot_9 if in_K1 & !inlist(pformany,0,1) & soffany == 0
local control_mean_9 = r(mean)
local control_sd_9 = r(sd)
replace behav_index_tot_9 = (behav_index_tot_9 - `control_mean_9')/`control_sd_9'
replace behav_index_tot_9 = -behav_index_tot_9

* Unweighted behavior index: standardize each component, take the row mean of
* the standardized components, and flip the sign. Only rows with a
* non-missing hs_num_num_absences are used.
local z_parts
foreach comp of varlist hs_num_num_absences hs_num_anysusp ever_incarcerated not_grad_hs_ontime {
	summarize `comp' if in_K1 & !inlist(pformany,0,1) & soffany == 0 & hs_num_num_absences != .
	generate c_`comp' = (`comp' - r(mean)) / r(sd) if `comp' != . & hs_num_num_absences != .
	local z_parts `z_parts' c_`comp'
}

egen behav_index_unweight = rowmean(`z_parts') if hs_num_num_absences != .
replace behav_index_unweight = -behav_index_unweight


/************************ Fix up test Data ************************************/

* Running totals over indices 3-10 of the tooke* and tookm* flags:
* total_mcas adds both, total_mcas_e only tooke*, total_mcas_m only tookm*.
generate total_mcas = 0
generate total_mcas_e = 0
generate total_mcas_m = 0
forvalues g = 3/10 {
	* A missing flag counts as zero in the totals
	replace tooke`g' = 0 if tooke`g' == .
	replace tookm`g' = 0 if tookm`g' == .
	replace total_mcas = total_mcas + tooke`g' + tookm`g'
	replace total_mcas_e = total_mcas_e + tooke`g'
	replace total_mcas_m = total_mcas_m + tookm`g'

	* After the totals are updated, blank out the flags for selected
	* assignment years
	if `g' == 3 {
		replace tookm`g' = . if inrange(yr_asgn,1997,2000)
	}
	else if `g' == 5 {
		replace tooke`g' = . if inlist(yr_asgn,1997,1998)
		replace tookm`g' = . if inlist(yr_asgn,1997,1998)
	}
	else if `g' == 6 {
		replace tooke`g' = . if yr_asgn == 1997
	}

	* Indicators built from the (possibly blanked) flags; the _sasid
	* versions are defined only for rows with in_sims equal to 1
	generate a_took_mcas_in_`g' = (tooke`g' == 1 | tookm`g' == 1)
	generate b_took_mcas_in_`g'_sasid = a_took_mcas_in_`g' if in_sims == 1
	generate c_took_mcas_e_in_`g'_sasid = tooke`g' if in_sims == 1
	generate d_took_mcas_m_in_`g'_sasid = tookm`g' if in_sims == 1

}
* Flag rows with a non-missing sattot
generate has_sat = sattot != .

* Row means of the std_e* and std_m* variables (one- and two-character suffixes)
egen mcas_avge = rowmean(std_e? std_e??)
egen mcas_avgm = rowmean(std_m? std_m??)

* has_sat restricted to rows with a_took_mcas_in_10 equal to 1; blanked for
* assignment year 2003
generate has_sat_took_10 = has_sat if a_took_mcas_in_10 == 1
replace has_sat_took_10 = . if yr_asgn == 2003

* qt_* variables: zero-fill where has_sat_took_10 is defined, and set to
* missing where it is not
foreach sat_var of varlist qt_* {
	replace `sat_var' = 0 if `sat_var' == . & has_sat_took_10 != . & yr_asgn != 2003
	replace `sat_var' = . if has_sat_took_10 == .
}




/******************* Clean up college variables ***********************************/

* Year and month of birth
generate yob = year(dob)
generate mob = month(dob)

* For each NSC enrollment date, convert to string, take the first four
* characters as the year, and convert that back to a number. The year variable
* is named by swapping "_nsc" for "_yr" in the source variable name.
foreach nsc_date in enrollmentbegin_nsc enrollmentend_nsc enrollmentend_nsc_4yr {
	local yr_name : subinstr local nsc_date "_nsc" "_yr"
	tostring `nsc_date', replace
	generate `yr_name' = substr(`nsc_date',1,4)
	destring `yr_name', replace
}


* Flags for a first enrollment year no later than 14 (resp. 15) years after
* the assignment year
generate any_6m_asgn = enrollmentbegin_yr <= yr_asgn + 14
generate any_18m_asgn = enrollmentbegin_yr <= yr_asgn + 15

* Interact each college outcome with each enrollment-window flag, so that the
* product is the outcome within that window.
* NOTE(review): any_6m and any_18m are not created in this file; they are
* expected to be in the data already -- confirm.
local outcomes outofstate_nsc any4yr_nsc any2yr_nsc instate_nsc  private_nsc public_nsc ///
	outofstate_nsc_4yr instate_nsc_4yr private_nsc_4yr public_nsc_4yr
local windows 6m_asgn 18m_asgn 6m 18m
foreach win of local windows {
	foreach oc of local outcomes {
		generate `oc'_`win' = any_`win' * `oc'
	}
}

* Graduation variables, with special handling of assignment years 2002-2003
generate grad_or_enrolled_nsc = still_enroll == 1 | evergrad_nsc == 1

* Zero-fill evergrad_nsc, keep a full copy, then blank it for 2002 and 2003
replace evergrad_nsc = 0 if evergrad_nsc == .
generate evergrad_nsc_full = evergrad_nsc
replace evergrad_nsc = . if yr_asgn == 2002 | yr_asgn == 2003

replace evergrad_nsc_4yr = 0 if evergrad_nsc_4yr == .
replace evergrad_nsc_4yr = . if yr_asgn == 2002 | yr_asgn == 2003

* On-time graduation: the enrollment end year must fall within 18 years of the
* assignment year when any4yr_nsc is 1, and within 16 years when it is 0.
* Defined only for assignment years before 2002 and any4yr_nsc equal to 0 or 1.
generate grad_ontime_asgn = ((yr_asgn + cond(any4yr_nsc == 1, 18, 16)) >= enrollmentend_yr) & evergrad_nsc ///
	if yr_asgn < 2002 & inlist(any4yr_nsc,0,1)

* On-time graduation based on the _4yr variables: 18-year and 20-year windows
generate grad_4yr_ontime_asgn_4yr = (yr_asgn + 18 >= enrollmentend_yr_4yr) & evergrad_nsc_4yr if yr_asgn < 2002
generate grad_6yr_ontime_asgn_4yr = (yr_asgn + 20 >= enrollmentend_yr_4yr) & evergrad_nsc_4yr if yr_asgn < 2000

* Test variable: grad_ontime_asgn, forced to 1 where grad_4yr_ontime_asgn_4yr is 1
generate grad_ontime_asgn_new = cond(grad_4yr_ontime_asgn_4yr == 1, 1, grad_ontime_asgn)

*Final clean-up and save

* Complement of frpl_sim (missing stays missing). No complement is built from
* the original frpl because that variable is dropped below.
gen not_frpl_sim = frpl_sim == 0 if frpl_sim != .
replace hs_ever_incarcerated = . if behav_index_tot == .

* Zero-fill missing nLC and ELC
replace nLC = 0 if missing(nLC)
replace ELC = 0 if missing(ELC)

* Replace frpl with frpl_sim, and name its complement not_frpl
drop frpl
rename frpl_sim frpl
rename not_frpl_sim not_frpl

* Deal with few people who disappear from the SIMS for a year or two
replace any_spd1 = 0 if missing(any_spd1) & !missing(ontime_asgn1)
replace any_spd3 = 0 if missing(any_spd3) & !missing(ontime_asgn3)

* For any_spd1 and any_spd3, set some years to missing because we don't have
* full reports
replace any_spd1 = . if missing(ontime_asgn1)
replace any_spd3 = . if missing(ontime_asgn3)


* Save the full sample, then the final sample: rows whose pformany is neither
* 0 nor 1 and whose nsc_prim_name_link equals 1
save "$stata_data_analysis/full_samp.dta", replace
keep if !inlist(pformany,0,1)
keep if nsc_prim_name_link == 1
sort *
save "$stata_data_analysis/final_sample_v1.dta", replace
